library(dplyr)
library(ggplot2)
library(ggpubr)
library(emmeans)
library(ggsignif)
library(here)
library(tidyverse)
library(grid)
library(PNWColors)
theme_eb <- theme_classic(base_size = 16) +
theme(
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14)
)
theme_set(theme_eb)
pal3 <- pnw_palette("Sunset", 3, type=c("discrete"))
pal7 <- pnw_palette("Starfish", 7, type=c("discrete"))
here::here()
[1] "/Users/emma/Library/CloudStorage/OneDrive-SharedLibraries-IndianaUniversity/Lennon, Jay - 0000_Bueren/Projects/LifeStyle/PhageLifestyleSporulation"
### note the quartz script needs to be fixed, the headers are janky and wrong (had to be manually fixed)
ParS <- read.delim2(here("02_motifs/ParS/data/01_out_01_indiv_hmm1_ParS.tsv"))
ParS$q.value<- as.numeric(ParS$q.value)
ParS$p.value<- as.numeric(ParS$p.value)
ParS$score<- as.numeric(ParS$score)
### remove any duplicate motifs (occasionally a motif will be a palindromic and hit both + and - strands)
ParS <- ParS %>%
group_by(sequence_name, start, stop) %>% # group by coordinates
slice_max(order_by = score, n = 1) %>% # keep only the row with highest score
ungroup()
### phi3T KY030782, spbeta AF020713
## redrock (actino phage with ParABS) GU339467, uses parS sites but of a different type than sporulation ParS
## combine phage metadata with ParS hits. Label any phage that had no ParS hits with "no_hit" in metadata
inph <- read.csv(here("00_data", "inphared_db", "14Apr2025_knownsporestatus.csv"), row.names = 1)
inph$lifestyle <- ifelse(inph$lifestyle=="Temp", "Temperate", inph$lifestyle)
inph$bac.host <- ifelse(inph$sporulation=="Spor", "Sporulating Bacillota", "Nonsporulating Bacillota") #"Asporogenous Bacillota")
inph$bac.host <- ifelse(inph$newgtdb_Phylum=="Bacillota", inph$bac.host, "Non-Bacillota")
inph <- unite(inph, nice, c("lifestyle", "bac.host"), sep=" Phages of ", remove = FALSE )
meta.cats <- unique(select(inph, host_phage_spor, bac.host, lifestyle, nice, sporulation))
#inph <- select(inph, Accession, Host, Genome.Length..bp., gtdb_f, f_spor, newgtdb_Phylum, host_phage_spor, phage_type, sporulation, lifestyle, nice)
all <- merge(inph, ParS, by.x="Accession", by.y="sequence_name", all.x=TRUE, all.y=FALSE)
all$motif_id[is.na(all$motif_id)] <- "no_hit"
### create binary hit column of 1 for ParS hit, 0 if no hit
all$hit <- ifelse(all$motif_id=="ParS_BSub", 1, 0)
all$q.value[is.na(all$q.value)] <- 1
### remove duplicates (keep only reference strains)
### currently this only works on bacilliota, 99.5% ANI over 100% alignment threshold
## rest of inphared is pending
all <- subset(all, all$ref=="not_run" | all$ref=="ref_seq")
### threshold testing
library(dplyr)
library(ggplot2)
# Define thresholds
thresholds <- 10^seq(-6, -1, by = 1) # from 1e-6 to 1e-1
thresholds <- c(thresholds, 0.05, 1) # add 0.05 explicitly if desired
results <- lapply(thresholds, function(th) {
all %>%
group_by(Accession, host_phage_spor) %>%
summarise(
pos.ParS.hit = max(ifelse(!is.na(q.value) & q.value <= th, 1, 0)),
.groups = "drop"
) %>%
group_by(host_phage_spor) %>%
summarise(
Phage_has_ParS = sum(pos.ParS.hit),
total.phage = n(),
.groups = "drop"
) %>%
mutate(
ParS.pos.perc = Phage_has_ParS / total.phage * 100,
threshold = th
)
}) %>% bind_rows()
results.fig <- merge(results, meta.cats, by="host_phage_spor")
# Plot
p<- ggplot(results.fig, aes(x = threshold, y = ParS.pos.perc, color = bac.host, linetype = lifestyle)) +
geom_line(size = 1.2) +
geom_point() +
scale_x_log10() + # log scale for q-values
labs(
x = "False Positive (q-value) Threshold",
y = "% Phages with 1 or more ParS Binding Site"
) + labs(linetype = "Phage Lifestyle", color="Bacterial Host") +geom_vline(xintercept = 1e-4, linetype = "dotted") #+ theme(legend.position="none")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
Please use `linewidth` instead.
p + theme(legend.position = c(0.125, 0.82)) + scale_color_manual(values = rev(pal3))
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
Please use the `legend.position.inside` argument of `theme()` instead.
ggsave(here("02_motifs/ParS/QValueTresh1e4_line.png"),width=7, height=7)
#### Q FILTERING
ParS.qual <- all
#ParS.qual$hit <- ifelse(ParS.qual$q.value<1e-4, 1, 0)
ParS.qual$hit <- ifelse(ParS.qual$q.value<1e-4, 1, 0)
## create binary list of phages w/ and w/out ParS hits
ParS.pos <- ParS.qual %>% group_by(Accession, host_phage_spor) %>%
summarise(total.ParS.hits = sum(hit), pos.ParS.hit = max(hit), .groups = "drop") #%>%
ParS.pos %>% count(host_phage_spor)
## create binary list of phages w/ and w/out ParS hits
ParS.sanity <- ParS.qual %>% group_by(Accession, host_phage_spor, Host, Description, Lowest.Taxa, Genus, Family, Genome.Length..bp., molGC....) %>%
summarise(total.ParS.hits = sum(hit), pos.ParS.hit = max(hit), .groups = "drop") #%>%
only.pos <- subset(ParS.sanity, ParS.sanity$total.ParS.hits>0)
ParS.pos %>% count(host_phage_spor)
ParS.summary <- ParS.pos %>%
group_by(host_phage_spor) %>%
summarise(
total_ParS_hits = sum(total.ParS.hits, na.rm = TRUE),
Phage_has_ParS = sum(pos.ParS.hit, na.rm = TRUE),
total.phage = n(),
.groups = "drop"
) %>%
mutate(ParS.pos.perc = Phage_has_ParS / total.phage * 100)
ParS.bin <- ParS.pos[,c(1,2,4)]
ParS.hits <- subset(ParS.qual, ParS.qual$hit==1)
## find center phage genome (whole genome /2)
ParS.hits$seq.mdpt <- as.numeric(ParS.hits$Genome.Length..bp.)/2
## find center of motif
ParS.hits$motif.mdpt <- (ParS.hits$stop + ParS.hits$start )/ 2
### subtract midpoint motif from sequence midpoint to see how far away they are
ParS.hits <- ParS.hits %>%
mutate(mdpt.align = (motif.mdpt - seq.mdpt))
ParS.hits$mdpt.align.kbp <- ParS.hits$mdpt.align/1000
#### TO GET RELATIVE motif alignmen
# Relative position as fraction of genome length
# (-0.5 = start, 0 = center, +0.5 = end)
ParS.hits$rel.mdpt <- (ParS.hits$motif.mdpt - ParS.hits$seq.mdpt) / ParS.hits$Genome.Length..bp.
# Relative position from genome start (0 to 1)
ParS.hits$rel.frac <- ParS.hits$motif.mdpt / ParS.hits$Genome.Length..bp.
# Optionally convert to percentage
ParS.hits$rel.percent <- ParS.hits$rel.frac * 100
##Set specific order for bacterial hosts to appear on graphs
ParS.hits$nice <- factor(ParS.hits$nice, levels = c('Lytic Phages of Sporulating Bacillota', 'Temperate Phages of Sporulating Bacillota', 'Lytic Phages of Nonsporulating Bacillota', 'Temperate Phages of Nonsporulating Bacillota', 'Lytic Phages of Non-Bacillota', 'Temperate Phages of Non-Bacillota' ),ordered = TRUE)
ggplot(ParS.hits, aes(x = mdpt.align.kbp, fill = nice)) +
geom_histogram(binwidth = 5, position = "dodge") +
geom_vline(xintercept = 0, linetype = "dotted") +
scale_x_continuous(breaks = seq(-90, 90, by = 30)) +
labs(x = "Motif distance (kb) from center of phage genome", y = "Motif count") +
facet_wrap(~ nice, ncol = 2)+ ggtitle("Absolute Position of ParS Motif on Phage Genomes") + theme(legend.position="none")+ scale_fill_manual(values = pal7)
ggsave(here("02_motifs/ParS/figs/AbsoluteParSposition_1e4.png"), width=7, height=7)
ggplot(ParS.hits, aes(x = rel.mdpt, fill = nice)) +
geom_histogram(binwidth = 0.05, position = "dodge") +
geom_vline(xintercept = 0, linetype = "dotted") +
scale_x_continuous(breaks = seq(-0.5, 0.5, by = 0.25)) +
labs(x = "Motif position relative to center of phage genome", y = "Motif count") +
facet_wrap(~ nice, ncol = 2) + ggtitle("Relative Position of ParS Motif on Phage Genomes")+ theme(legend.position="none") + scale_fill_manual(values = pal7)
ggsave(here("02_motifs/ParS/figs/RelativeParSposition_1e4.png"), width=7, height=7)
# -----------------------------
pairwise_fisher_both_directions <- function(df, group_col, hits_col, total_col, p.adjust.method = "BH") {
groups <- unique(df[[group_col]])
combs <- combn(groups, 2, simplify = FALSE)
results <- data.frame(
group1 = character(),
group2 = character(),
odds_ratio = numeric(),
conf_low = numeric(),
conf_high = numeric(),
p.value = numeric(),
p.adj = numeric(),
stringsAsFactors = FALSE
)
pvals <- numeric()
for (c in combs) {
g1 <- df[df[[group_col]] == c[1], ]
g2 <- df[df[[group_col]] == c[2], ]
tab <- matrix(c(
g1[[hits_col]], g1[[total_col]] - g1[[hits_col]],
g2[[hits_col]], g2[[total_col]] - g2[[hits_col]]
), nrow = 2, byrow = TRUE)
rownames(tab) <- c(c[1], c[2])
colnames(tab) <- c("Present", "Absent")
ft <- fisher.test(tab)
# group1 vs group2
results <- rbind(results, data.frame(
group1 = c[1],
group2 = c[2],
odds_ratio = ft$estimate,
conf_low = ft$conf.int[1],
conf_high = ft$conf.int[2],
p.value = ft$p.value,
p.adj = NA
))
# group2 vs group1 (inverse OR)
results <- rbind(results, data.frame(
group1 = c[2],
group2 = c[1],
odds_ratio = 1 / ft$estimate,
conf_low = 1 / ft$conf.int[2],
conf_high = 1 / ft$conf.int[1],
p.value = ft$p.value,
p.adj = NA
))
pvals <- c(pvals, ft$p.value, ft$p.value)
}
results$p.adj <- p.adjust(pvals, method = p.adjust.method)
return(results)
}
# -----------------------------
# Compute Fisher tests
t <- pairwise_fisher_both_directions(
df = ParS.summary,
group_col = "host_phage_spor",
hits_col = "Phage_has_ParS",
total_col = "total.phage",
p.adjust.method = "BH"
)
# Specify pairs of interest (with direction)
selected_pairs <- list(
c("OtherPhyla_Lytic_NonSpor", "OtherPhyla_Temp_NonSpor"),
c("Bacillota_Lytic_NonSpor", "Bacillota_Temp_NonSpor"),
c("Bacillota_Temp_Spor", "Bacillota_Temp_NonSpor"),
c("Bacillota_Lytic_Spor", "OtherPhyla_Lytic_NonSpor"),
c("Bacillota_Lytic_Spor", "Bacillota_Lytic_NonSpor"),
c("Bacillota_Lytic_Spor", "Bacillota_Temp_Spor")
)
pairs_df <- do.call(rbind, lapply(selected_pairs, function(x) {
data.frame(group1 = x[1], group2 = x[2], stringsAsFactors = FALSE)
}))
pairwise_res_subset <- t %>%
semi_join(pairs_df, by = c("group1", "group2"))
pairwise_res_subset <- pairwise_res_subset %>%
mutate(
label_combined = ifelse(
p.adj < 0.05,
ifelse(
p.adj < 0.0001,
paste0("OR=", round(odds_ratio, 2), ", p<0.0001"),
paste0("OR=", round(odds_ratio, 2), ", p=", signif(p.adj, 3))
),
"n.s."
)
)
# -----------------------------
# Factor order
levels_order <- c(
'Bacillota_Lytic_Spor',
'Bacillota_Temp_Spor',
'Bacillota_Lytic_NonSpor',
'Bacillota_Temp_NonSpor',
'OtherPhyla_Lytic_NonSpor',
'OtherPhyla_Temp_NonSpor'
)
df_all <- ParS.pos
# Sample sizes
n_counts <- df_all %>%
group_by(host_phage_spor) %>%
summarise(
positive = sum(pos.ParS.hit == 1, na.rm = TRUE),
total = n(),
.groups = "drop"
) %>%
mutate(
perc = round(100 * positive / total, 1),
label = paste0(positive, "/", total, " (", perc, "%)")
)
# Base plot
pd <- position_jitter(width = 0.25, height = 0.05)
vp <- ggplot(df_all, aes(x = host_phage_spor, y = pos.ParS.hit, color = host_phage_spor)) +
geom_point(
aes(fill = host_phage_spor),
position = pd,
size = 2,
alpha = 0.6,
shape = 21,
color = "black",
stroke = 0.3
) +
scale_y_continuous(
#limits = c(-0.5, 10), # make room for bars above
breaks = c(0, 1),
labels = c("Absent", "Present")
) +
scale_x_discrete(limits = levels_order, drop = FALSE, labels = c(
'Bacillota_Lytic_Spor' = 'Lytic',
'Bacillota_Temp_Spor' = 'Temperate',
'Bacillota_Lytic_NonSpor' = 'Lytic',
'Bacillota_Temp_NonSpor' = 'Temperate',
'OtherPhyla_Lytic_NonSpor' = 'Lytic',
'OtherPhyla_Temp_NonSpor' = 'Temperate'
)) +
coord_cartesian(clip = "off") +
theme(
legend.position = "none",
plot.margin = margin(10, 10, 30, 10)
) +
xlab("") + ylab("Presence of ParS in phage genomes") +
geom_text(
data = n_counts,
aes(x = host_phage_spor, y = -0.3, label = paste0("n=", label)),
inherit.aes = FALSE,
size = 4
)
# -----------------------------
spor_label <- textGrob("Sporulating Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 1/6, y = -0.08, just = "center")
nonspor_label <- textGrob("Non-Sporulating Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 3/6, y = -0.08, just = "center")
other_label <- textGrob("Non-Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 5/6, y = -0.08, just = "center")
vp <- vp + annotation_custom(spor_label) + annotation_custom(nonspor_label) + annotation_custom(other_label)
pairwise_res_subset$group1 <- factor(pairwise_res_subset$group1, levels = levels_order, ordered = TRUE)
pairwise_res_subset <- pairwise_res_subset %>%
arrange(factor(group1, levels = levels_order),
factor(group2, levels = levels_order)) %>%
mutate(y.position = max(df_all$pos.ParS.hit, na.rm = TRUE) +
0.5 + (nrow(.) - row_number()) * 0.3)
# Add significance bars
vp_fisher <- vp +
stat_pvalue_manual(
pairwise_res_subset,
label = "label_combined",
hide.ns = FALSE,
tip.length = 0.02,
size = 3.5
)
# Show plot
vp_fisher+ scale_fill_manual(values = pal7)
ggsave(here("02_motifs/ParS/figs/ParS_Enrichment_1e4.png"), width=7, height=7)
# -----------------------------
# Count positives and totals per group
n_counts <- df_all %>%
group_by(host_phage_spor) %>%
summarise(
positive = sum(total.ParS.hits > 0, na.rm = TRUE),
total = n(),
.groups = "drop"
) %>%
mutate(
perc = round(100 * positive / total, 1),
label = paste0(positive, "/", total, " (", perc, "%)")
)
# -----------------------------
kruskal_res <- kruskal.test(total.ParS.hits ~ host_phage_spor, data = df_all)
#print(kruskal_res)
# -----------------------------
# Define comparisons
comparisons_list <- combn(unique(df_all$host_phage_spor), 2, simplify = FALSE)
# Bar placement settings
offset <- 1 # push bars higher than max data
y_step <- 1.5 # vertical spacing
y_start <- max(df_all$total.ParS.hits, na.rm = TRUE) + offset
# Compute raw p-values
pairwise_res <- lapply(seq_along(comparisons_list), function(i){
comp <- comparisons_list[[i]]
x <- df_all$total.ParS.hits[df_all$host_phage_spor == comp[1]]
y <- df_all$total.ParS.hits[df_all$host_phage_spor == comp[2]]
wt <- wilcox.test(x, y)
data.frame(
group1 = comp[1],
group2 = comp[2],
p.value = wt$p.value,
y.position = y_start + (i - 1) * y_step
)
}) %>% bind_rows()
# Adjust p-values (BH)
pairwise_res$p.adj <- p.adjust(pairwise_res$p.value, method = "BH")
# Label formatting
pairwise_res <- pairwise_res %>%
mutate(
label = case_when(
p.adj < 0.0001 ~ "p < 0.0001",
p.adj < 0.05 ~ paste0("p = ", signif(p.adj, 3)),
TRUE ~ "n.s."
)
)
# -----------------------------
levels_order <- c(
'Bacillota_Lytic_Spor',
'Bacillota_Temp_Spor',
'Bacillota_Lytic_NonSpor',
'Bacillota_Temp_NonSpor',
'OtherPhyla_Lytic_NonSpor',
'OtherPhyla_Temp_NonSpor'
)
visualize
library(dplyr)
library(ggplot2)
library(rstatix)
Attaching package: ‘rstatix’
The following object is masked from ‘package:stats’:
filter
# --- Factor setup ---
levels_order <- c(
'Bacillota_Lytic_Spor',
'Bacillota_Temp_Spor',
'Bacillota_Lytic_NonSpor',
'Bacillota_Temp_NonSpor',
'OtherPhyla_Lytic_NonSpor',
'OtherPhyla_Temp_NonSpor'
)
df_all <- ParS.pos %>%
mutate(host_phage_spor = factor(host_phage_spor, levels = levels_order))
df_violin <- df_all %>%
filter(total.ParS.hits > 0)
# --- n counts ---
n_counts <- df_all %>%
group_by(host_phage_spor) %>%
summarise(
positive = sum(total.ParS.hits > 0, na.rm = TRUE),
total = n(),
.groups = "drop"
) %>%
mutate(
perc = round(100 * positive / total, 1),
label = paste0(positive, "/", total, " (", perc, "%)")
)
# --- Stats ---
# Kruskal–Wallis
kw_res <- kruskal_test(df_all, total.ParS.hits ~ host_phage_spor)
# Pairwise Wilcoxon with BH correction
wilc_test <- df_all %>%
pairwise_wilcox_test(total.ParS.hits ~ host_phage_spor,
p.adjust.method = "BH") #%>%
wilc_test <- wilc_test %>% unite(col = "test", c("group1", "group2"), remove = FALSE, sep = "_")
### subset tests you want to show
wilc_test <- subset(wilc_test, wilc_test$test=="Bacillota_Lytic_Spor_Bacillota_Temp_Spor" |
wilc_test$test=="Bacillota_Lytic_Spor_Bacillota_Lytic_NonSpor" |
wilc_test$test=="Bacillota_Lytic_Spor_OtherPhyla_Lytic_NonSpor" |
wilc_test$test=="Bacillota_Temp_Spor_Bacillota_Temp_NonSpor" |
wilc_test$test=="Bacillota_Temp_Spor_OtherPhyla_Temp_NonSpor" |
wilc_test$test=="Bacillota_Lytic_NonSpor_Bacillota_Temp_NonSpor" |
wilc_test$test=="OtherPhyla_Lytic_NonSpor_OtherPhyla_Temp_NonSpor")
max_y <- max(df_all$total.ParS.hits, na.rm = TRUE)
step <- .85
wilc_test <- wilc_test %>%
mutate(y.position = max_y + rev(seq(1, nrow(.)))*step)
# Make label: OR not relevant here (Wilcoxon), so p only
wilc_test <- wilc_test %>%
mutate(
p_label = ifelse(p.adj < 0.0001, "p < 0.0001",
ifelse(p.adj < 0.05, paste0("p=", signif(p.adj, 3)), "n.s."))
)
# --- Plot ---
pd <- position_jitter(width = 0.25, height = 0.25)
vp <- ggplot(df_all, aes(x = host_phage_spor,
y = total.ParS.hits,
color = host_phage_spor)) +
# Violin for positives
geom_violin(
data = df_violin,
inherit.aes = TRUE,
color = "black",
fill = NA,
scale = "width",
width = 0.8,
trim = TRUE
) +
# Jittered points
geom_point(
aes(fill = host_phage_spor),
position = pd,
size = 2,
alpha = 0.5,
shape = 21,
color = "black",
stroke = 0.3
) +
# Total n
geom_text(
data = n_counts,
aes(
x = host_phage_spor,
y = -0.75, # slightly below 0; adjust as needed
label = label
),
inherit.aes = FALSE,
size = 3,
vjust = 1 # align text above the y position
) +
# Add significance bars
stat_pvalue_manual(
wilc_test,
label = "p_label",
hide.ns = FALSE,
tip.length = 0.02,
size = 3.5
) +
# Axis order
scale_x_discrete(limits = levels_order, drop = FALSE, labels = c(
'Bacillota_Lytic_Spor' = 'Lytic',
'Bacillota_Temp_Spor' = 'Temperate',
'Bacillota_Lytic_NonSpor' = 'Lytic',
'Bacillota_Temp_NonSpor' = 'Temperate',
'OtherPhyla_Lytic_NonSpor' = 'Lytic',
'OtherPhyla_Temp_NonSpor' = 'Temperate'
)) +
# Give room for n labels and bars
#expand_limits(y = max(df_all$total.ParS.hits, na.rm = TRUE) * 4) +
coord_cartesian(clip = "off") +
theme(legend.position = "none",
plot.margin = margin(10, 10, 30, 10)) +
xlab("") + scale_y_continuous(limits=c(0,12.5,2), breaks=c(0,2,4,6,8))
#vp #+ expand_limits(y = c(-1, max(df_all$total.ParS.hits) * 1.2))
spor_label <- textGrob("Sporulating Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 1/6, y = -0.08, just = "center")
nonspor_label <- textGrob("Non-Sporulating Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 3/6, y = -0.08, just = "center")
other_label <- textGrob("Non-Bacilliota Host", gp = gpar(fontsize = 14, fontface = "bold"), x = 5/6, y = -0.08, just = "center")
vp.krusk <- vp + annotation_custom(spor_label) + annotation_custom(nonspor_label) + annotation_custom(other_label)+ scale_fill_manual(values = pal7)
vp.krusk
ggsave(here("02_motifs/ParS/figs/ParS_Hits_Kruskal_1e4.png"), width=7, height=7)
analyze_enrichment <- function(df, ref_treatment = "Enriched") {
# relevel the treatment factor
df$host_phage_spor <- relevel(factor(df$host_phage_spor), ref = ref_treatment)
# logistic regression: motif presence ~ treatment
model <- glm(pos.ParS.hit ~ host_phage_spor, family = binomial, data = df)
# estimated probabilities per treatment
emm <- emmeans(model, ~ host_phage_spor, type = "response")
list(
model_summary = summary(model),
probabilities = emm,
pairwise_tests = pairs(emm, adjust = "tukey")
)
}
library(emmeans)
res <- analyze_enrichment(ParS.bin, ref_treatment = "Bacillota_Lytic_Spor")
res$model_summary # logistic regression coefficients
Call:
glm(formula = pos.ParS.hit ~ host_phage_spor, family = binomial,
data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.8736 0.1638 -11.439 < 2e-16 ***
host_phage_sporBacillota_Lytic_NonSpor -2.6744 0.3730 -7.170 7.48e-13 ***
host_phage_sporBacillota_Temp_NonSpor -3.6431 0.4771 -7.636 2.24e-14 ***
host_phage_sporBacillota_Temp_Spor -0.7655 0.2640 -2.900 0.00373 **
host_phage_sporOtherPhyla_Lytic_NonSpor -2.6808 0.1914 -14.005 < 2e-16 ***
host_phage_sporOtherPhyla_Temp_NonSpor -2.7620 0.2031 -13.599 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2731.1 on 19984 degrees of freedom
Residual deviance: 2537.2 on 19979 degrees of freedom
AIC: 2549.2
Number of Fisher Scoring iterations: 8
res$probabilities # estimated motif probability per treatment
host_phage_spor prob SE df asymp.LCL asymp.UCL
Bacillota_Lytic_Spor 0.13313 0.01890 Inf 0.10024 0.17472
Bacillota_Lytic_NonSpor 0.01048 0.00347 Inf 0.00546 0.02001
Bacillota_Temp_NonSpor 0.00400 0.00179 Inf 0.00167 0.00958
Bacillota_Temp_Spor 0.06667 0.01290 Inf 0.04544 0.09680
OtherPhyla_Lytic_NonSpor 0.01041 0.00102 Inf 0.00859 0.01261
OtherPhyla_Temp_NonSpor 0.00961 0.00114 Inf 0.00761 0.01213
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
res$pairwise_tests # all pairwise comparisons
contrast odds.ratio SE df null z.ratio
Bacillota_Lytic_Spor / Bacillota_Lytic_NonSpor 14.5040 5.4100 Inf 1 7.170
Bacillota_Lytic_Spor / Bacillota_Temp_NonSpor 38.2086 18.2000 Inf 1 7.636
Bacillota_Lytic_Spor / Bacillota_Temp_Spor 2.1500 0.5680 Inf 1 2.900
Bacillota_Lytic_Spor / OtherPhyla_Lytic_NonSpor 14.5967 2.7900 Inf 1 14.005
Bacillota_Lytic_Spor / OtherPhyla_Temp_NonSpor 15.8310 3.2200 Inf 1 13.599
Bacillota_Lytic_NonSpor / Bacillota_Temp_NonSpor 2.6343 1.4700 Inf 1 1.731
Bacillota_Lytic_NonSpor / Bacillota_Temp_Spor 0.1482 0.0584 Inf 1 -4.846
Bacillota_Lytic_NonSpor / OtherPhyla_Lytic_NonSpor 1.0064 0.3520 Inf 1 0.018
Bacillota_Lytic_NonSpor / OtherPhyla_Temp_NonSpor 1.0915 0.3890 Inf 1 0.246
Bacillota_Temp_NonSpor / Bacillota_Temp_Spor 0.0563 0.0278 Inf 1 -5.830
Bacillota_Temp_NonSpor / OtherPhyla_Lytic_NonSpor 0.3820 0.1750 Inf 1 -2.097
Bacillota_Temp_NonSpor / OtherPhyla_Temp_NonSpor 0.4143 0.1920 Inf 1 -1.899
Bacillota_Temp_Spor / OtherPhyla_Lytic_NonSpor 6.7892 1.5600 Inf 1 8.346
Bacillota_Temp_Spor / OtherPhyla_Temp_NonSpor 7.3633 1.7600 Inf 1 8.342
OtherPhyla_Lytic_NonSpor / OtherPhyla_Temp_NonSpor 1.0846 0.1690 Inf 1 0.521
p.value
<.0001
<.0001
0.0434
<.0001
<.0001
0.5110
<.0001
1.0000
0.9999
<.0001
0.2888
0.4025
<.0001
<.0001
0.9954
P value adjustment: tukey method for comparing a family of 6 estimates
Tests are performed on the log odds ratio scale
prob_df <- as.data.frame(res$probabilities)
head(prob_df)
host_phage_spor prob SE df asymp.LCL asymp.UCL
Bacillota_Lytic_Spor 0.13312693 0.018902074 Inf 0.10023534 0.17471589
Bacillota_Lytic_NonSpor 0.01047730 0.003474089 Inf 0.00546026 0.02001136
Bacillota_Temp_NonSpor 0.00400320 0.001786691 Inf 0.00166722 0.00958076
Bacillota_Temp_Spor 0.06666667 0.012881224 Inf 0.04544214 0.09679919
OtherPhyla_Lytic_NonSpor 0.01041140 0.001020512 Inf 0.00859006 0.01261400
OtherPhyla_Temp_NonSpor 0.00960747 0.001142782 Inf 0.00760772 0.01212644
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
prob_df$host_phage_spor <- reorder(prob_df$host_phage_spor, prob_df$prob)
fig.meta <- merge(prob_df, meta.cats, by="host_phage_spor", all=TRUE)
pairs_df <- as.data.frame(res$pairwise_tests) %>%
mutate(
contrast_char = as.character(contrast),
p_label = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
TRUE ~ "ns"
),
group1 = sapply(strsplit(contrast_char, " / "), `[`, 1),
group2 = sapply(strsplit(contrast_char, " / "), `[`, 2)
)
pairs_df <- as.data.frame(res$pairwise_tests) %>%
mutate(
contrast_char = as.character(contrast),
p_label = case_when(
p.value > 0.05 ~ "ns", # non-significant
p.value < 0.001 ~ "p < 0.0001", # very small p-values
TRUE ~ paste0("p = ", sprintf("%.3f", p.value)) # others
),
group1 = sapply(strsplit(contrast_char, " / "), `[`, 1),
group2 = sapply(strsplit(contrast_char, " / "), `[`, 2)
)
pairs_df <- pairs_df %>%
mutate(
x_num1 = as.numeric(factor(group1, levels = levels(fig.meta$phage_type))),
x_num2 = as.numeric(factor(group2, levels = levels(fig.meta$phage_type))),
span = abs(x_num1 - x_num2),
x_pos = (x_num1 + x_num2) / 2
) %>%
arrange(span)
# -----------------------------
# 4️⃣ Compute y positions for nested brackets
offset_step <- 0.01
pairs_df <- pairs_df %>%
rowwise() %>%
mutate(
y_base = max(
fig.meta$asymp.UCL[fig.meta$host_phage_spor %in% c(group1, group2)],
na.rm = TRUE
)
) %>%
ungroup() %>%
arrange(span, desc(p_label)) %>% # shorter spans lower, ns lower
mutate(
y_pos = y_base + (row_number() - 1) * offset_step
)
# -----------------------------
# 5️⃣ Prepare comparisons list for ggsignif
comparisons_list <- lapply(1:nrow(pairs_df), function(i) {
c(as.character(pairs_df$group1[i]), as.character(pairs_df$group2[i]))
})
# -----------------------------
# 6️⃣ Plot with dodge and black brackets
pd <- position_dodge(width = 0.5)
tplot <- ggplot(fig.meta, aes(x = host_phage_spor, y = prob, color = lifestyle)) +
geom_point(size = 3, position = pd) +
geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.2, position = pd) +
ggsignif::geom_signif(
comparisons = comparisons_list,
annotations = pairs_df$p_label,
y_position = pairs_df$y_pos,
tip_length = 0.02,
textsize = 3.5,
color = "black"
) +
ylim(0, max(pairs_df$y_pos + 0.02)) + # extend y-axis to fit top brackets
ylab("Probability of 1 or more ParS \n binding sites in phage genome") +
xlab("") +
labs(color = "Phage Lifestyle") +
theme_classic(base_size = 14)+scale_x_discrete(
labels = c(
'Lytic_Spor' = 'Lytic',
'Temp_Spor' = 'Temperate',
'Lytic_NonSpor' = 'Lytic',
'Temp_NonSpor' = 'Temperate'
)
) + theme(plot.margin = margin(10, 10, 30, 10) # large bottom margin for labels
) +
coord_cartesian(clip = "off")
tplot
spor_label <- textGrob(
"Sporulating Host", gp = gpar(fontsize = 14, fontface = "bold"),
x = 0.25, y = -0.13, just = "center"
)
nonspor_label <- textGrob(
"Non-Sporulating Host", gp = gpar(fontsize = 14, fontface = "bold"),
x = 0.75, y = -0.13, just = "center"
)
tplot_final <- tplot +
annotation_custom(spor_label) +
annotation_custom(nonspor_label)
tplot_final
NA
NA